# -*- coding: utf-8 -*-
# article_info.py
# Created by Hardy on 26th, Jan
# Copyright 2017 杭州网川教育有限公司. All rights reserved.

import datetime
import heapq
import html
import json
import logging
import math
import operator
import re

import utils.utils as utils
from querier.esquerier import ElasticSearchQuerier
from utils.wechat_article_dedup import article_deduplicate


DAYS = 15
MINIMUM_SHOULD_MATCH = '5<85% 10<9'
MAX_CHARACTER = 30
CATEGORY_CUTOFF = 0.9
MAX_KEYWORDS = 10


class SohuArticleSearchMiniQuerier(ElasticSearchQuerier):

    def __init__(self, es, index, doc_type, nlp_service=None):
        super(SohuArticleSearchMiniQuerier, self).__init__(es, index, doc_type)
        self.nlp_service = nlp_service

    def _build_query(self, args):
        term = args.get('term', '')
        term = term if term else ''
        filters = args.get('filters', {})
        if filters is None:
            filters = {}
        order = args.get('order_by', utils.ORDER_OVERALL)
        from_ = args.get('from', 0)
        size_ = args.get('size', 10)
        highlight = args.get('highlight', False)

        deduplicate = args.get('deduplicate', False)

        term2, keywords, ex_keywords, weights = utils.process_query_term(term, self.nlp_service, 'keywords',
                                                                         allowed_num_words=MAX_KEYWORDS)

        query = self._genQuery(' '.join(keywords), term, filters, order, from_, size_, highlight)

        return query, {}, {'order': order, 'deduplicate': deduplicate}

    def _build_result(self, es_result, param):
        order = param['order']
        deduplicate = param['deduplicate']
        total = es_result['hits']['total']
        articles = []
        for hit in es_result['hits']['hits']:
            articles.append(self.extractResult(hit, order))

        if deduplicate:
            try:
                articles = article_deduplicate(articles)
            except Exception as e:
                pass

        return {
            'total': total,
            'articles': articles
        }

    def _genQuery(self, query_keywords, term, filters, order, from_, size_, highlight):
        must_clause = []
        should_clause = []
        filter_clause = []
        if filters:
            # filter_clause = self._add_filter_clause(filter_clause, filters, 'author', 'should')
            # filter_clause = self._add_filter_match(filter_clause, filters, 'biz_code', 'should')
            filter_clause = self._addFilterRangeClause(filter_clause, filters, 'publish_timestamp')
            filter_clause = self._addFilterRangeClause(filter_clause, filters, 'read_num')
            filter_clause = self._addFilterRangeClause(filter_clause, filters, 'comment_num')
            filter_clause = self._addFilterRangeClause(filter_clause, filters, 'image_num')
            filter_clause = self._addFilterRangeClause(filter_clause, filters, 'video_num')
            filter_clause = self._addFilterRangeClause(filter_clause, filters, 'text_len')

            if filters.get('category'):
                filters['category'] = [utils.category_smzdm_2_encode(c) for c in filters['category']]
                # filters['category_weight'] = [CATEGORY_CUTOFF]
                filter_clause = self._addFilterClause(filter_clause, filters, 'category', 'should')
                # filter_clause = self._add_filter_range_clause(filter_clause, filters, 'category_weight')

        if query_keywords.strip():
            must_clause.append(
                {
                    'multi_match': {
                        'analyzer': 'whitespace',
                        'query': query_keywords,
                        'fields': ['title_seg^3'],
                        # 'minimum_should_match': ""
                    }
                }
            )

        if term:
            term = term.strip()
            if len(term) <= 5:
                should_clause.append(
                    {
                        'match_phrase': {
                            "title": {
                                'query': term[0:MAX_CHARACTER],
                                'slop': 1,
                                'boost': 3,
                            },

                        },
                        # 'match_phrase': {
                        #     "biz_name": {
                        #         'query': term[0:MAX_CHARACTER],
                        #         'slop': 2,
                        #         'boost': 30,
                        #     },
                        #
                        # },
                    }
                )
            else:
                should_clause.append(
                    {
                        'match': {
                            "title": {
                                'query': term[0:MAX_CHARACTER],
                                'boost': 3,
                                'minimum_should_match': MINIMUM_SHOULD_MATCH
                            },

                        }
                    }
                )

        query = {"query": {
            "bool": {
                # "must": must_clause,
                # "should": should_clause,
                "filter": filter_clause,
                # "must": {'bool': {}},
                # "minimum_should_match": 1
            }
        }, 'from': from_, 'size': size_}

        if must_clause:
            query['query']['bool']['must'] = must_clause

        if should_clause:
            query['query']['bool']['should'] = should_clause
            query['query']['bool']['minimum_should_match'] = 1

        if order == 'relative':
            query['sort'] = [
                '_score',
                {'publish_timestamp': 'desc'}
            ]
            # if filters.get('category'):
            #     query['sort'] = [{'category_weight': 'desc'}] + query['sort']
        elif order == 'read_num':
            query['sort'] = [
                # '_score',
                {'read_num': 'desc'}
            ]
        elif order == 'comment_num':
            query['sort'] = [
                # '_score',
                {'comment_num': 'desc'}
            ]
        else:
            query['sort'] = [
                {'publish_timestamp': 'desc'},
                '_score'
            ]
            # if filters.get('category'):
            #     query['sort'] += [{'category_weight': 'desc'}]

        query['track_scores'] = True
        if highlight:
            query['highlight'] = {
                "pre_tags": ["<span class='keyword'>"],
                "post_tags": ["</span>"],
                "fields": {"title": {}}
            }
        # else:
        #     query['highlight'] = {
        #         "pre_tags": [""],
        #         "post_tags": [""],
        #         "fields": {"keywords": {}, "title_seg": {}}
        #     }

        return query

    def _addFilterMatch(self, must_clause, filters, key, cond='must'):
        if key in filters:
            if filters[key]:
                clause = []
                must_clause.append({
                    'bool': {cond: clause}
                })
                values = filters[key]
                if isinstance(values, str):
                    values = values.split(' ')
                for fk in values:
                    clause.append({'match': {key: {'query': fk, 'minimum_should_match': '20<100% 20<20'}}})
        return must_clause

    def _addFilterClause(self, filter_clause, filters, key, cond='must'):
        if key in filters:
            if filters[key]:
                clause = []
                filter_clause.append({
                    'bool': {
                        cond: clause
                    }
                })
                for fk in filters[key]:
                    clause.append({'term': {key: fk}})
        return filter_clause

    def _addFilterRangeClause(self, filter_clause, filters, key):
        if key in filters:
            if filters[key]:
                clause = []
                filter_clause.append({
                    'bool': {
                        'must': clause
                    }
                })
                fk = filters[key]
                if not isinstance(fk, list) or len(fk) < 1:
                    pass
                else:
                    min_fk = fk[0]
                    if len(fk) >= 2:
                        max_fk = fk[1]
                    else:
                        max_fk = None
                    if min_fk is not None and min_fk != 'null':
                        clause.append({'range': {key: {"gte": min_fk}}})
                    if max_fk is not None and max_fk != 'null':
                        clause.append({'range': {key: {"lte": max_fk}}})
        return filter_clause

    def extractResult(self, hit, order):
        source_ = dict(hit.get('_source', {}))
        keywords = source_['keywords']

        highlight = hit.get('highlight')
        h_keywords = []

        h_title = source_['title']
        if highlight:
            h_title = highlight.get('title')
            if h_title:
                h_title = h_title[0]
            else:
                h_title = source_['title']

        h_keywords = h_keywords if h_keywords else keywords[0:10]

        h_kv = {}

        for i in range(0, len(h_keywords)):
            if h_kv.get(h_keywords[i]) is None:
                h_kv[h_keywords[i]] = i

        h_keywords = [k[0] for k in sorted(h_kv.items(), key=operator.itemgetter(1))]

        return {
            'url': source_.get('url', ''),
            'abstract': source_.get('abstract'),
            'author': source_.get('author', ''),
            'category': utils.category_smzdm_2_decode(source_.get('category', -1)),
            'comment_num': source_.get('comment_num', 0),
            'read_num': source_.get('read_num', 0),
            'keywords': h_keywords,
            'msg_cdn_url': source_.get('msg_cdn_url', ''),
            'text_len':source_.get('text_len', 0),
            'title': utils.clean_text(h_title),
            'title_seg': source_.get('title_seg', []),
            'video_num': source_.get('video_num', 0),
            'image_num': source_.get('image_num', 0),
            'article_id': source_.get('article_id', ''),
            'publish_timestamp': source_.get('publish_timestamp', ''),
            'crawler_timestamp': source_.get('crawler_timestamp', '')
        }
